import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline
# Suppress warnings from pandas
import warnings
warnings.filterwarnings('ignore')
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier, export_graphviz
from sklearn import metrics
from sklearn.metrics import precision_score ,recall_score, f1_score
from sklearn.metrics import accuracy_score
from plotly.offline import iplot, init_notebook_mode,download_plotlyjs
import plotly.graph_objects as go
import plotly.express as px
#use this code to be able to display all the output in the cell instead of only displaying the out put for the last one. see the next cell.
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"
# Load the four source datasets (FAOSTAT exports / derived CSVs).
df_prod=pd.read_csv('FAOSTAT_crop_production.csv') # Total yearly crop production in tonnes by country
df_reg =pd.read_csv('regional_code.csv') # regions for the country data
df_crop =pd.read_csv('FAOSTAT_data_main_crop_yield.csv') # crop yield per hectare
df_temp = pd.read_csv('global_temp_change.csv')  # yearly temperature-change (anomaly) data
df_crop.head()  # notebook display: preview the yield data

# Automated EDA report for the crop-yield data.
from pandas_profiling import ProfileReport
crop_profile = ProfileReport(df_crop)
crop_profile  # notebook display: render the profile report

# Drop rows with missing values before any analysis.
df_crop= df_crop.dropna()
df_crop.columns

#change the column name
# Rename the generic FAOSTAT 'Value' column to the descriptive 'yield'.
df_crop.rename(columns={'Value':'yield'}, inplace=True)
df_crop.columns
We only need the following features from the data; the rest are either constant or irrelevant:
[ 'Area Code', 'Area', 'Item Code', 'Item', 'Year', 'Value']
# Keep only the identifying/feature columns ('Value' was already renamed to 'yield').
df_crop = df_crop[['Area Code', 'Area','Item Code', 'Item', 'Year', 'yield']]
df_crop.tail()
# NOTE(review): dead helper kept for reference; its query string uses a
# single '=' where pandas.query needs '==', so it would raise if revived.
# def getFiltered(df,col):
#     df_sub=df.query("Item = {}".format(col))
#     df_sub_max = df_sub.groupby(['Year'], as_index = False)['Value'].max()
#     return df_sub_max
df_crop['Item'].unique()
# Normalise the FAOSTAT item name 'Rice, paddy' to plain 'Rice'.
df_crop['Item'] = df_crop['Item'].str.replace('Rice, paddy', 'Rice')
df_crop['Item'].unique()
df_crop.head()
# Split the yield data into two five-year periods for comparison.
df_crop_early = df_crop.query("Year in [2010,2011,2012,2013,2014]")
df_crop_late = df_crop.query("Year in [2015,2016,2017,2018,2019]")

# Total yield per crop type within each period.
df_crop_early_item = df_crop_early.groupby(['Item'], as_index=False)['yield'].sum()
df_crop_late_item = df_crop_late.groupby(['Item'], as_index=False)['yield'].sum()
df_crop_early_item.head()

# Merge the two periods side by side, one row per crop.
df_crop_early_late_item = pd.merge(df_crop_early_item, df_crop_late_item, on='Item', suffixes=('_early', '_late'))
df_crop_early_late_item.head()

# Plot the two periods as grouped bars per crop.
df_crop_early_late_item.set_index('Item', inplace=True)
df_crop_early_late_item.plot.bar()
plt.ylabel('Yield in tonnes')
plt.xlabel('Crop Type')
# Fixed label: the early group actually covers 2010-2014, not 2011-2014.
plt.title('Yield (2010-2014) vs (2015-2019)')
plt.show()
# Percentage change in total yield between the two periods, per crop.
# Positive values mean the late period out-yielded the early one, so this
# is a rate of *change*; the original title said "decrease" for all crops.
df_crop_early_late_item['roi'] = (df_crop_early_late_item['yield_late']-df_crop_early_late_item['yield_early'])/(df_crop_early_late_item['yield_late'])
df_crop_early_late_item['roi'] = df_crop_early_late_item['roi']*100
df_crop_early_late_item.head()
df_crop_early_late_item.reset_index(inplace=True)
# Plot the rate of change per crop, largest first.
fig = px.bar(df_crop_early_late_item, x="Item", y="roi").update_xaxes(categoryorder="max descending")
# Fixed label: early period is 2010-2014, not 2011-2014.
fig.update_layout(xaxis_tickangle=45, title='Rate of Yield change early(2010-2014) vs late(2015-2019) by Crop')
fig.show()
In order to analyze by region, we need to join the region data onto the crop data.
df_reg.head()
# we only need the country code and group
df_region = df_reg[['Country Group','Country Code']]
df_region.head()
# Left-join the region (country-group) labels onto the crop data by country code.
df_crop_reg = pd.merge(df_crop, df_region, how='left', left_on=['Area Code'], right_on=['Country Code'])
df_crop_reg.head()
# df_crop_reg_profile = ProfileReport(df_crop_reg)
# df_crop_reg_profile
df_reg= df_reg.dropna()
# Drop crop rows whose country code had no matching region in the left join.
df_crop_reg = df_crop_reg.dropna()
#rename the Country Group column
df_crop_reg.rename(columns={"Country Group": "region"} , inplace = True)
df_crop_reg['region'].unique()
# Split the region-labelled data into the same two five-year periods.
df_crop_reg_early = df_crop_reg.query("Year in [2010,2011,2012,2013,2014]")
df_crop_reg_late = df_crop_reg.query("Year in [2015,2016,2017,2018,2019]")
# Total yield per region within each period.
df_crop_reg_early_r = df_crop_reg_early.groupby(['region'], as_index=False)['yield'].sum()
df_crop_reg_late_r = df_crop_reg_late.groupby(['region'], as_index=False)['yield'].sum()
df_crop_reg_early_r.head()
# One row per region with early/late totals side by side.
df_crop_reg_early_late_r = pd.merge(df_crop_reg_early_r, df_crop_reg_late_r, on='region', suffixes=('_early', '_late'))
#df_crop_reg_early_late_r.set_index('region', inplace=True)
df_crop_reg_early_late_r.head()
import plotly.graph_objects as go
regions = df_crop_reg_early_late_r['region']
early = df_crop_reg_early_late_r['yield_early']
late = df_crop_reg_early_late_r['yield_late']
fig = go.Figure(data=[
    go.Bar(name='early', x=regions, y=early),
    go.Bar(name='late', x=regions, y=late)
])
# Change the bar mode
# Fixed label: the early group covers 2010-2014, not 2011-2014.
fig.update_layout(barmode='group', xaxis_tickangle=45, title='Yield early(2010-2014) vs late(2015-2019)')
fig.show()
# Percentage change in total yield per region between the two periods
# (positive = increase).
df_crop_reg_early_late_r['roi'] = (df_crop_reg_early_late_r['yield_late']-df_crop_reg_early_late_r['yield_early'])/(df_crop_reg_early_late_r['yield_late'])
df_crop_reg_early_late_r['roi'] = df_crop_reg_early_late_r['roi']*100
df_crop_reg_early_late_r.head()
# Top 10 regions by rate of change.
df_crop_reg_early_late_r.nlargest(10, ['roi'])
fig = px.bar(df_crop_reg_early_late_r, x="region", y="roi").update_xaxes(categoryorder="max descending")
# Fixed labels: early period is 2010-2014, and the metric is a rate of
# change (the original title said "decrease" while the y-axis said "Increase").
fig.update_layout(barmode='group', xaxis_tickangle=45,
                  title='Rate of Yield change early(2010-2014) vs late(2015-2019)',
                  yaxis_title="Percentage Rate of Increase")
fig.show()
# Inspect the temperature-change data and rename 'Value' to 'temperature'.
df_temp.head()
df_temp.columns
df_temp.rename(columns={'Value':'temperature'}, inplace =True)
df_temp.columns
We only need the following features from the data; the rest are either constant or irrelevant:
[ 'Area Code', 'Area', 'Months Code', 'Months', 'Year', 'Value']
# Keep only the identifying/feature columns ('Value' is now 'temperature').
df_temp = df_temp[[ 'Area Code', 'Area','Months Code', 'Months', 'Year', 'temperature']]
df_temp.head()
df_temp.isna().sum()
df_temp = df_temp.dropna()
# Keep the whole-year ('Meteorological year') records only.
df_temp_metro = df_temp.query("Months == 'Meteorological year'")
df_temp_metro.head()
# Fixed: Year is numeric after read_csv, so compare to the int 2016 —
# the original compared to the string '2016', which matches no rows.
df_temp_metro_2016 = df_temp_metro.query("Year == 2016")
df_temp_metro_2016.head()
# Sort both frames by Year so merge_asof can align them.
df_crop_reg_sorted=df_crop_reg.sort_values(by=['Year'])
df_temp_sorted=df_temp.sort_values(by=['Year'])
# Nearest-key (backward) merge on Year within each Area: every temperature
# row picks up the most recent crop record for the same country.
# NOTE(review): merge_asof is an approximate merge — confirm an exact merge
# on ['Area', 'Year'] was not intended here.
df_crop_temp = pd.merge_asof(df_temp_sorted, df_crop_reg_sorted,
                             on='Year',
                             by='Area')
df_crop_temp.head()
df_crop_reg.head()
df_crop_temp.isna().sum()
# Drop temperature rows with no matching crop record.
df_crop_temp= df_crop_temp.dropna()
df_crop_temp.shape
df_crop_temp.isna().sum()
# query by name of the crop
# Create one module-level DataFrame per crop (e.g. Maize, Wheat, Rice).
# NOTE(review): writing into globals() is fragile — a dict keyed by crop
# name would be safer — but later cells reference these names directly.
product =df_crop_temp['Item'].unique()
for prod in product:
    globals()[prod] = df_crop_temp.loc[df_crop_temp['Item'] == prod]
# Yearly maximum yield per crop.
Maize_max =Maize.groupby(['Year'], as_index = False)['yield'].max()
Wheat_max =Wheat.groupby(['Year'], as_index = False)['yield'].max()
Rice_max =Rice.groupby(['Year'], as_index = False)['yield'].max()
# df_crop_temp.rename(columns={"Country Group": "region"} , inplace = True)
# df_crop_temp['region'].unique()
# Make region names valid Python identifiers (spaces and hyphens become
# underscores) so they can be assigned as variable names below.
df_crop_temp['region'] = df_crop_temp['region'].str.replace(' ','_')
df_crop_temp['region'].unique()
df_crop_temp['region'] = df_crop_temp['region'].str.replace('-','_')
df_crop_temp['region'].unique()
regions =df_crop_temp['region'].unique()
# One module-level DataFrame per region (same globals() pattern as for crops).
for reg in regions:
    globals()[reg] = df_crop_temp.loc[df_crop_temp['region'] == reg]
regions
Africa.head()
# Per-crop yearly maximum yield within Africa.
Africa_maize = Africa.query("Item=='Maize'")
Africa_maize_max = Africa_maize.groupby(['Year'], as_index=False)['yield'].max()
Africa_rice = Africa.query("Item=='Rice'")
Africa_rice_max = Africa_rice.groupby(['Year'], as_index=False)['yield'].max()
Africa_wheat = Africa.query("Item=='Wheat'")
Africa_wheat_max = Africa_wheat.groupby(['Year'], as_index=False)['yield'].max()
# Bin the temperature anomaly: 'low' for 0..2 inclusive, 'high' otherwise.
# NOTE(review): negative anomalies also fall into 'high' here — confirm
# that is intended rather than only x > 2 meaning 'high'.
df_crop_temp['temp']= df_crop_temp.temperature.apply(lambda x : 'low' if (x>=0 and x<=2) else 'high')
df_crop_temp.head()
df_crop_temp_high = df_crop_temp.query("temp=='high'")
df_crop_temp_low = df_crop_temp.query("temp=='low'")
# Total yield per crop under high vs low anomaly.
df_crop_temp_high_i = df_crop_temp_high.groupby(['Item'], as_index =False)['yield'].sum()
df_crop_temp_low_i = df_crop_temp_low.groupby(['Item'], as_index =False)['yield'].sum()
df_crop_temp_low_i.head()
# One row per crop with high/low totals side by side.
df_crop_temp_high_low_i = pd.merge(df_crop_temp_high_i, df_crop_temp_low_i , on='Item',suffixes=('_high', '_low'))
#df_crop_reg_early_late_r.set_index('region', inplace=True)
df_crop_temp_high_low_i.head()
import plotly.graph_objects as go
# Grouped bars: total yield per crop under high vs low anomaly.
# Fixed: use a fresh name for the x values — they are crop Items, and the
# original reused (and clobbered) the `regions` variable that a later
# effect-size loop still iterates over.
crops = df_crop_temp_high_low_i['Item']
high = df_crop_temp_high_low_i['yield_high']
low = df_crop_temp_high_low_i['yield_low']
fig = go.Figure(data=[
    go.Bar(name='high', x=crops, y=high),
    go.Bar(name='low', x=crops, y=low)
])
# Change the bar mode
fig.update_layout(barmode='group', xaxis_tickangle=45, title='Yield in high vs low temperature Anomaly',
                  xaxis_title="Crop Type",
                  yaxis_title="Yield per Hectare")
fig.show()
import statsmodels.api as sm
from statsmodels.formula.api import ols
import seaborn as sns
from scipy import stats
# def GetByReg(df):
# product =df_crop['Item'].unique()
# for prod in product:
# globals()[prod] = df_crop.loc[df_crop['Item'] == prod]
# return globals()[prod]
df_crop_temp.head()
#query the yearly temperature anomaly only
df_crop_temp_metro = df_crop_temp.query("Months == 'Meteorological year'")
df_crop_temp_metro.head()
# Split into high (>1) and low (0 < t <= 1) anomaly groups.
df_high_temp = df_crop_temp_metro.query("temperature > 1")
# Fixed: '< =' (with a space) is invalid pandas.query syntax and raises at
# runtime; it must be '<='.
df_low_temp = df_crop_temp_metro.query("temperature > 0 and temperature <= 1")
df_low_temp.head()
# Mean yield per (temperature, region) pair within each group.
df_high_temp_ave = df_high_temp.groupby(['temperature','region'], as_index =False)['yield'].mean()
df_low_temp_ave = df_low_temp.groupby(['temperature','region'], as_index =False)['yield'].mean()
df_low_temp_ave.head()
import plotly.graph_objects as go
# Yearly yield under high vs low anomaly as two line+marker traces.
fig = go.Figure()
trace1 = go.Scatter(x=df_high_temp["Year"], y=df_high_temp["yield"], mode="lines+markers", name="high temp")
fig.add_trace(trace1)
trace2 = go.Scatter(x=df_low_temp["Year"], y=df_low_temp["yield"], mode="lines+markers", name="Low temp")
fig.add_trace(trace2)
fig.update_layout(
    title={
        "text": "Global Average Yield in High & Low Temperature Anomaly",
        "x":0.5,
        "xanchor": "center"
    },
    xaxis_title="Year",
    yaxis_title="Yield per Hectare")
fig.show()
# NOTE(review): duplicate of an earlier cell — Rice_max is recomputed unchanged.
Rice_max =Rice.groupby(['Year'], as_index = False)['yield'].max()
# To check if the data is normally distributed using Shapiro test
def NormalityCheck(df):
    """Run a Shapiro-Wilk normality test on *df* and print the conclusion.

    df : 1-D array-like of sample values (n >= 3).
    Prints the test result, the decision at alpha = 0.05 and the sample
    size; returns None.
    """
    result = stats.shapiro(df)
    # Fixed typo in the message ("stattistics").
    print('The test statistics is: {}'.format(result))
    # result[1] is the p-value; the original messages called it the
    # "test statistic", which was misleading — fixed below.
    if result[1] < 0.05:
        print('The p-value is less than .05 , the null hypothesis is rejected and the data is not normally distributed')
    else:
        print('The p-value is greater than .05 , we fail to reject the null hypothesis and the data is normally distributed')
    print('The size of the data is : {}'.format(len(df)))
def NormalityPlot(df,df1,df2):
    """Overlay the distributions of the population and the two temperature groups."""
    plt.figure(figsize=(20, 10))
    # One distribution plot per (data, label) pair, in the original drawing order.
    for data, label in ((df, 'population data'), (df1, 'high temp'), (df2, 'low temp')):
        sns.distplot(data, label=label)
    plt.title('Distribution of temprature', fontsize=16)
    plt.xlabel('temp', fontsize=16)
    plt.legend(frameon=False, fontsize=16, loc='best')
    plt.show()
#Testing for equal variance between the two groups by using the levene's test
def EqualVarianceCheck(df1,df2):
    """Levene's test for equality of variances between *df1* and *df2*.

    Prints the hypotheses, the sample sizes, the test result and the
    decision at alpha = 0.05; returns None.
    """
    print('Hypothesis Testing for Equal Variance\n\n')
    print('H0: The two variances are equal')
    print('HA: The two variances are not equal\n')
    print(f'The sizes of the data are {len(df1)} and {len(df2)}\n')
    levene_result = stats.levene(df1, df2)
    print(f'{levene_result}\n')
    # Index 1 of the result is the p-value of the Levene statistic.
    if levene_result[1] < 0.05:
        print('The test statistic is less than .05 , the null hypothesis is rejected and the two variances are not the same')
    else:
        print('The test statistic is greater than .05 , we fail to reject the null hypothesis and the two variances are the same')
def WelchsTestVarTrue(df1,df2):
    """Welch's t-test (unequal variances) for a mean-yield difference.

    Fixes vs the original: the first sample was misspelled `dfl` (a
    guaranteed NameError), ttest_ind defaulted to the equal-variance
    Student's t-test instead of Welch's (equal_var=False), and the
    fail-to-reject branch claimed an effect instead of no effect.
    Prints the result and the decision at alpha = 0.05; returns None.
    """
    result_w = stats.ttest_ind(df1, df2, equal_var=False)
    print(f'{result_w}\n')
    # result_w[1] is the p-value of the two-sided test.
    if result_w[1] < 0.05:
        print('The test statistic is less than .05 , the null hypothesis is rejected and Temperature has an effect on the crop yield')
    else:
        print('The test statistic is greater than .05 , we fail to reject the null hypothesis and temperature has no effect on crop yield')
def Cohen_d(group1, group2):
    """Absolute Cohen's d effect size between two samples, rounded to 1 dp.

    Pools the size-weighted variances of both groups and scales the mean
    difference by the pooled standard deviation.
    """
    n1, n2 = len(group1), len(group2)
    mean_gap = group1.mean() - group2.mean()
    # Size-weighted pooled variance of the two groups.
    pooled_var = (n1 * group1.var() + n2 * group2.var()) / (n1 + n2)
    effect = mean_gap / np.sqrt(pooled_var)
    return round(abs(effect), 1)
# Yearly ('Meteorological year') records split by sign of the anomaly.
df_crop_temp_m = df_crop_temp.query("Months == 'Meteorological year'")
df_crop_temp_m_h = df_crop_temp_m.query( "temperature > 0")
df_crop_temp_m_l = df_crop_temp_m.query( "temperature <= 0")
# Per-region means of the numeric columns for each group.
# NOTE(review): groupby(...).mean() over mixed-type columns relies on the
# old pandas behaviour of silently dropping non-numeric columns; newer
# pandas requires numeric_only=True.
df_crop_temp_m_h_reg =df_crop_temp_m_h.groupby(['region'],as_index = False).mean()
df_crop_temp_m_l_reg =df_crop_temp_m_l.groupby(['region'],as_index = False).mean()
df_crop_temp_m_l_reg.head()
df_crop_temp_m_l.shape
df_crop_temp_m_h.shape
df_crop_temp_m_h.head()
# Check whether the two yield samples have equal variance (Levene's test).
EqualVarianceCheck(df_crop_temp_m_l['yield'],df_crop_temp_m_h['yield'])
Since the two groups do not have equal variances, we use Welch's t-test (the unequal-variance t-test) instead of Student's t-test.
# Welch's t-test: does mean yield differ between non-positive- and
# positive-anomaly years?
result_w = stats.ttest_ind(df_crop_temp_m_l['yield'],df_crop_temp_m_h['yield'], equal_var=False)
print('{}' '\n' .format(result_w))
if result_w[1] < 0.05:
    print('The test statistic is less than .05 , the null hypothesis is rejected and Temperature has an effect on the crop yield')
else:
    # Fixed message: failing to reject H0 means NO detected effect.
    print('The test statistic is greater than .05 , we fail to reject the null hypothesis and temperature has no effect on crop yield')
# Quick sanity checks on the grouped and raw high-anomaly frames.
df_crop_temp_m_h_reg.head()
df_crop_temp_m_h_reg.shape
df_crop_temp_m_h_reg.isna().sum()
df_crop_temp_m_h.head()
df_crop_temp_m_h.isna().sum()
# # ANOVA test production vs country group
# from statsmodels.formula.api import ols
# formula ='yield ~ C(region)'
# lm = ols(formula, df_crop_temp_m_h).fit()
# table = sm.stats.anova_lm(lm, typ=2)
# print(table)
The p-value is less than 0.05, so we reject the null hypothesis: temperature has a different effect in different regions. What is the effect size?
df_crop_temp_m_l.head()
regions
# Cohen's d effect size (low- vs high-anomaly yields) per region.
# NOTE(review): an earlier plotting cell reassigned `regions` to the crop
# Items — verify this loop still iterates region names, not crops.
for i in regions:
    d= Cohen_d(df_crop_temp_m_l[df_crop_temp_m_l['region'] ==i]['yield'], df_crop_temp_m_h[df_crop_temp_m_h['region'] == i]['yield'])
    print (f'effect size for {i} is {d}' )
# Hard-coded effect sizes (transcribed from the loop output above) for plotting.
# NOTE(review): keep these in sync with the computed values; also note the
# stray leading space in ' Australia_and_New_Zealand'.
regions=['Low_Income_Food_Deficit_Countries', 'Eastern_Africa', 'High_income_economies','Northern_America','Northern_Africa','Central_Asia',' Australia_and_New_Zealand' ]
d=[0.5,0.4,0.2,0,0.9,0,0.6]
import plotly.graph_objects as go
# (Duplicate of the two lists above, preserved as in the original notebook.)
regions=['Low_Income_Food_Deficit_Countries', 'Eastern_Africa', 'High_income_economies','Northern_America','Northern_Africa','Central_Asia',' Australia_and_New_Zealand' ]
d=[0.5,0.4,0.2,0,0.9,0,0.6]
fig = go.Figure(data=[
    go.Bar(x=regions, y=d),
]).update_layout(barmode='group', xaxis_tickangle = 45, title= 'Temperature Effect on Crop Yield by Region', yaxis_title = 'Effect size')
fig.show()
# # ANOVA test production vs country group
# formula = 'yield ~ C(Item)'
# lm = ols(formula, df_crop_temp_m_h).fit()
# table = sm.stats.anova_lm(lm, typ=2)
# print(table)
# Cohen's d effect size (low- vs high-anomaly yields) per crop type.
item_list = df_crop_temp_m_h.Item.unique()
for i in item_list:
    d= Cohen_d(df_crop_temp_m_l[df_crop_temp_m_l['Item'] ==i]['yield'], df_crop_temp_m_h[df_crop_temp_m_h['Item'] == i]['yield'])
    print (f'effect size for {i} is {d}' )
import plotly.graph_objects as go
# Hard-coded per-crop effect sizes (transcribed from the loop output above).
# Fixed axis label typo: 'Barly' -> 'Barley' (matches the FAOSTAT item name).
crops = ['Rice', 'Maize','Wheat','Barley','Millet','Sorghum']
fig = go.Figure(data=[
    go.Bar(x=crops, y=[0.3,0.2,0.3,0.2,0.1,0.2]),
]).update_layout(barmode='group', xaxis_tickangle = 45, title= 'Temperature Effect on Crop Yield by Crop type')
fig.show()